***************
* Title: gambia_ecd_edcc_table5.do
* Author: Todd Pugatch
* Description: replication code for Blimpo, Carneiro, Jervis, and Pugatch,
*	"Improving Access and Quality in Early Childhood Development Programs: 
*		Experimental Evidence from The Gambia"
*	for Economic Development and Cultural Change
* Inputs: ECD_3to6_Gambia_[cleanv1/sampleprep1].dta
* Outputs: 	gambia_ecd_edcc_table5.[txt/xls/out]
*			gambia_ecd_edcc_table5_irt.[xls/out]
* Notes: creates Table 5
****************
#delimit;
local start=`"$S_TIME"';
clear;
clear matrix;
clear mata;
graph drop _all;
cap log close;
set more off;
/*set directory:
	cd mydir
*/
local data=`"Data\cleaned"';
local output=`"analysis\output"';
log using analysis\output\gambia_ecd_edcc_table5.txt, text replace;

* LOAD AND PREPARE DATA;
qui use `data'\ECD_3to6_Gambia_cleanv1, clear;
/*NEED TO CHECK TREATMENT STATUS OF SETTLEMENT 37028. DISREGARD FOR NOW*/
qui drop if settlement_code==37028;

/*winsorize annual per-capita expenditure below 1st and above 99th percentiles*/
foreach x in expend_yr_hhpc expend_yr_hhpc_usd {;
	winsor `x', gen(`x'_wins) p(0.01);
};
qui gen ecd_wtppctw_baseline=ecd_wtpq_baseline*12/expend_yr_hhpc_wins;
lab var expend_yr_hhpc_wins "Annual HH expenditure per capita, GMD, winsorized at 1/99 percentiles";
lab var expend_yr_hhpc_usd_wins "Annual HH expenditure per capita, USD, winsorized at 1/99 percentiles";
lab var ecd_wtppctw_baseline "Amount willing to pay for ECD as shr of monthly HH pc expenditure (winsorized) at baseline";

* define 3 groups:
	--in baseline
	--in endline (original sample)
	--in endline (newly sampled);
qui gen in_baseline=(ip22!=3 & ip22!=.);	
qui gen in_endline=(interview_result==1);
qui gen in_endline_old=(in_endline==1 & in_baseline==1);
qui gen in_endline_new=(in_endline==1 & in_baseline==0);
/*treat unresolved gender mismatches as new to endline*/
qui replace in_endline_new=1 if in_endline_new==0 & child_gender_mismatch_resolved==0;
qui replace in_endline_old=. if in_endline_new==1;

* repeat for having valid MDAT fine motor and language/hearing scores;
/*note that "in sample" defined as having at least one valid MDAT score, not both as in previous analyses of attrition*/
qui gen in_baseline_mdat=(in_baseline==1 & (zfinemotor_baseline!=.|zlanghear_baseline!=.));
qui replace in_baseline_mdat=. if in_endline_new==1;
foreach x in endline endline_old endline_new {;
	qui gen in_`x'_mdat=(in_`x'==1 & (zfinemotor_endline!=.|zlanghear_endline!=.));
};
qui replace in_endline_old_mdat=. if in_endline_new==1;
qui gen in_endline_old_mdat_base=in_endline_old_mdat;
qui replace in_endline_old_mdat_base=. if in_baseline_mdat!=1; /*in baseline MDAT & in endline MDAT*/
	
* define indicators for treatment;
qui gen communitybased=(treatment==6);
qui gen purecontrol=(treatment==1);
qui gen ECDAnnex_control=(treatment==4);
qui gen ECDAnnex_treated=(treatment==5);

* ANALYSIS OF ENDLINE RESULTS;
* prep sample;
keep if (child_age_mths_dob>=36 & child_age_mths_dob<84 & in_baseline==1)|(child_age_mths_dob==. & in_baseline==0);

* keep if new to endline and would have been 3-6 at baseline (define as 4-8 at endline to allow errors);
keep if in_endline_new==0|(in_endline_new==1 & selected_child_age>=48 & selected_child_age<=96 & selected_child_age!=.)|
	(in_endline_new==1 & child_gender_mismatch_resolved==0);

qui drop if in_endline==0;

* normalize endline MDAT scores to combined control group distribution;
/*adjust for age and gender, following normalization of gambia_ecd_clean1.do*/
drop zfinemotor_adj_endline zlanghear_adj_endline;
foreach x in finemotor langhear {;
	qui xi: reg `x'_endline selected_child_age selected_child_age2 child_female 
		if purecontrol==1|ECDAnnex_control==1;
	qui predict e, residuals; 
	qui su e if purecontrol==1|ECDAnnex_control==1;
	qui gen z`x'_adj_endline=(e-r(mean))/r(sd);
	drop e;
};
lab var zfinemotor_adj_endline "MDAT fine motor endline z-score (age-adjusted)";
lab var zlanghear_adj_endline "MDAT language & hearing endline z-score (age-adjusted)";

/*code missing controls as 0 and include dummy for missing in regressions*/
local X "expend_yr_hhpc_usd_wins ecd_attend_baseline headhh_ag pc1";
foreach x in `X' {;
	qui gen `x'_a=`x';
	qui replace `x'_a=0 if `x'==.;
	qui gen `x'_m=(`x'==.);
	lab var `x'_a "`x', replacing missing with 0";
	lab var `x'_m "`x' missing";
};

* sample sizes (# of children and project sites);
qui egen site=tag(settlement_code);
table treatment, c(freq rawsum site);
table treatment, c(rawsum in_baseline_mdat rawsum in_endline_mdat rawsum in_endline_old_mdat rawsum in_endline_new_mdat);

/*ANALYSIS: need 3 basic pieces of information:
	1) unadjusted mean/sd by T/C
	2) difference between T/C, adjusting for regional stratification
	3) difference between T/C, with control for baseline outcome & 
		adjusting for regional stratification
	
	--repeat for both MDAT modules
	--repeat for both experiments
	--run version that controls for baseline imbalances */
/*define controls (add baseline outcome later)*/
local X "expend_yr_hhpc_usd_wins_a ecd_attend_baseline_a headhh_ag_a 
	pc1_a ecd_attend_baseline_m headhh_ag_m pc1_m expend_yr_hhpc_usd_wins_m";
	
/*unadjusted differences*/
local Y "zlanghear_adj_endline zfinemotor_adj_endline";
qui orth_out `Y' using `output'\gambia_ecd_edcc_table5.xls, 
	by(treatment) se vce(cluster settlement_code) count colnum 
	title("unadjusted means") replace;
	
/*treatment effects*/
local replace "replace";
local Y "zlanghear_adj zfinemotor_adj";
	
foreach y in `Y' {;
	/*COMMUNITY-BASED EXPERIMENT*/
	/*without control for baseline outcome*/
	qui areg `y'_endline communitybased if purecontrol==1|communitybased==1, 
		a(region2) cluster(settlement_code);
	outreg communitybased using `output'\gambia_ecd_edcc_table5, se 3aster nocons `replace';
	
	/*with control for baseline outcome*/
	qui replace `y'_baseline=. if in_baseline==0|in_endline_new==1;
	qui areg `y'_endline communitybased `y'_baseline if purecontrol==1|communitybased==1, 
		a(region2) cluster(settlement_code);
	outreg communitybased using `output'\gambia_ecd_edcc_table5, se 3aster nocons `replace';
	
	/*with control for baseline imbalances*/
	qui areg `y'_endline communitybased `X' if purecontrol==1|communitybased==1, 
		a(region2) cluster(settlement_code);
	outreg communitybased using `output'\gambia_ecd_edcc_table5, se 3aster nocons `replace';	
	
	/*with control for baseline imbalances & baseline outcome*/
	qui replace `y'_baseline=. if in_baseline==0|in_endline_new==1;
	qui areg `y'_endline communitybased `y'_baseline `X' if purecontrol==1|communitybased==1, 
		a(region2) cluster(settlement_code);
	outreg communitybased using `output'\gambia_ecd_edcc_table5, se 3aster nocons `replace';

	/*ECD ANNEX EXPERIMENT*/
	/*without control for baseline outcome*/
	qui areg `y'_endline ECDAnnex_treated if ECDAnnex_control==1|ECDAnnex_treated==1, 
		a(region2) cluster(settlement_code);
	outreg ECDAnnex_treated using `output'\gambia_ecd_edcc_table5, se 3aster nocons `replace';
	
	/*with control for baseline outcome*/
	qui replace `y'_baseline=. if in_baseline==0|in_endline_new==1;
	qui areg `y'_endline ECDAnnex_treated `y'_baseline if ECDAnnex_control==1|ECDAnnex_treated==1, 
		a(region2) cluster(settlement_code);
	outreg ECDAnnex_treated using `output'\gambia_ecd_edcc_table5, se 3aster nocons `replace';
	
	/*with control for baseline imbalances*/
	qui areg `y'_endline ECDAnnex_treated `X' if ECDAnnex_control==1|ECDAnnex_treated==1, 
		a(region2) cluster(settlement_code);
	outreg ECDAnnex_treated using `output'\gambia_ecd_edcc_table5, se 3aster nocons `replace';	
	
	/*with control for baseline imbalances & baseline outcome*/
	qui replace `y'_baseline=. if in_baseline==0|in_endline_new==1;
	qui areg `y'_endline ECDAnnex_treated `y'_baseline `X' if ECDAnnex_control==1|ECDAnnex_treated==1, 
		a(region2) cluster(settlement_code);
	outreg ECDAnnex_treated using `output'\gambia_ecd_edcc_table5, se 3aster nocons `replace';
};

/*RESULTS USING IRT SCORES*/
* LOAD AND PREPARE DATA;
* keep only children aged 3-6 with valid endline MDAT score;
qui use `data'\gambia_ecd_sampleprep1, clear;
qui keep if in_endline_mdat==1;

* normalize IRT scores to combined control group distribution;
/*adjust for age and gender, following normalization of gambia_ecd_clean1.do*/
qui gen selected_child_age2=selected_child_age^2;
foreach y in fm lh {;
	drop ztheta_`y'_endline*;
	qui xi: reg theta_`y'_endline selected_child_age selected_child_age2 child_female 
		if purecontrol==1|ECDAnnex_control==1;
	qui predict e, residuals; 
	qui su e if purecontrol==1|ECDAnnex_control==1;
	qui gen ztheta_`y'_endline=(e-r(mean))/r(sd);
	drop e;
	lab var ztheta_`y'_endline "MDAT `y' IRT score, endline (z), complete cases";
	
	forval i=0/1 {;
		qui xi: reg theta_`y'_endline_imp`i' selected_child_age selected_child_age2 child_female 
			if purecontrol==1|ECDAnnex_control==1;
		qui predict e, residuals; 
		qui su e if purecontrol==1|ECDAnnex_control==1;
		qui gen ztheta_`y'_endline_imp`i'=(e-r(mean))/r(sd);
		drop e;
		lab var ztheta_`y'_endline_imp`i' "MDAT `y' IRT score, endline (z), imputing `i' for missing items";
	};
};

* version of IRT scores top-coded at +/-3 s.d.;
foreach x in fm lh {;
	foreach w in endline {;
		qui gen ztheta_`x'_`w'_tc=ztheta_`x'_`w'_imp0;
		qui replace ztheta_`x'_`w'_tc=3 if ztheta_`x'_`w'_imp0>3 & ztheta_`x'_`w'_imp0!=.;
		qui replace ztheta_`x'_`w'_tc=-3 if ztheta_`x'_`w'_imp0<-3 & ztheta_`x'_`w'_imp0!=.;
		lab var ztheta_`x'_`w'_tc "ztheta_`x'_`w'_imp0, top-coded at +/-3";
	};
};
	
* find intraclass correlations within sites (Pedro's question);
foreach y in fm lh {;
	foreach w in baseline endline {;
		qui loneway ztheta_`y'_`w'_imp0 settlement_code if in_`w'_mdat==1;
		di "intraclass correlation, z`y' `w', IRT model (imputing 0 for missing items)="r(rho);
		qui loneway z`y'_`w'_imp0 settlement_code if in_`w'_mdat==1;
		di "intraclass correlation, z`y' `w', based on raw scores (imputing 0 for missing items)="r(rho);
	};
};

* what proportion of items are missing?;
qui gen fm_baseline_misspct=fm_baseline_miss/23 if in_baseline_mdat!=.;
qui gen lh_baseline_misspct=lh_baseline_miss/26 if in_baseline_mdat!=.;
qui gen mdat_baseline_misspct=(fm_baseline_miss+lh_baseline_miss)/49 if in_baseline_mdat!=.;
qui gen fm_endline_misspct=fm_endline_miss/16 if in_endline_mdat!=.;
qui gen lh_endline_misspct=lh_endline_miss/26 if in_endline_mdat!=.;
qui gen mdat_endline_misspct=(fm_endline_miss+lh_endline_miss)/42 if in_endline_mdat!=.;
foreach w in baseline endline {;
	lab var mdat_`w'_misspct "proportion missing items, MDAT `w'";
	foreach x in fm lh {;
		lab var `x'_`w'_misspct "proportion missing items, MDAT `x' `w'";
	};
};
su *misspct;

* what proportion of IRT scores are outliers (exceed |3|);
foreach w in baseline endline {;
	foreach x in fm lh {;
		qui gen `x'_`w'_tcflag=(abs(ztheta_`x'_`w')>3) if ztheta_`x'_`w'!=.;
		lab var `x'_`w'_tcflag "top-code flag (>|3|), MDAT `x' `w'";
	};
	qui egen mdat_`w'_tcflag=rowmax(fm_`w'_tcflag lh_`w'_tcflag);
	lab var mdat_`w'_tcflag "top-code flag (>|3|), , at least one MDAT score, `w'";
};
su *tcflag;

* sample sizes (# of children and project sites);
qui egen site=tag(settlement_code);
table treatment, c(freq rawsum site);
table treatment, c(rawsum in_baseline_mdat rawsum in_endline_mdat rawsum in_endline_old_mdat rawsum in_endline_new_mdat);

/*ANALYSIS: need 3 basic pieces of information:
	1) unadjusted mean/sd by T/C
	2) difference between T/C, adjusting for regional stratification
	3) difference between T/C, with control for baseline outcome & 
		adjusting for regional stratification
	
	--repeat for both MDAT modules
	--repeat for various versions of IRT scores
	--repeat for both experiments */
/*define controls (add baseline outcome later)*/

/*unadjusted differences*/
local Y "ztheta_lh_endline_imp0 ztheta_lh_endline_imp1 ztheta_lh_endline 
	ztheta_lh_endline_tc ztheta_fm_endline_imp0 ztheta_fm_endline_imp1 
	ztheta_fm_endline ztheta_fm_endline_tc";
qui orth_out `Y' using `output'\gambia_ecd_edcc_table5_irt.xls, 
	by(treatment) se vce(cluster settlement_code) count colnum 
	title("unadjusted means") replace;
	
/*treatment effects*/
local replace "replace";
	
foreach y in lh fm {;
	/*COMMUNITY-BASED EXPERIMENT*/
	/*imputing 0/1 for missing items*/
	forval i=0/1 {;
		/*without control for baseline outcome*/
		qui areg ztheta_`y'_endline_imp`i' communitybased if purecontrol==1|communitybased==1, 
			a(region2) cluster(settlement_code);
		outreg communitybased using `output'\gambia_ecd_edcc_table5_irt, se 3aster nocons `replace';
		
		/*with control for baseline outcome*/
		qui replace ztheta_`y'_baseline_imp`i'=. if in_baseline==0|in_endline_new==1;
		qui areg ztheta_`y'_endline_imp`i' communitybased ztheta_`y'_baseline_imp`i' 
			if purecontrol==1|communitybased==1, a(region2) cluster(settlement_code);
		outreg communitybased using `output'\gambia_ecd_edcc_table5_irt, se 3aster nocons `replace';
	};
	
	/*complete cases*/
	/*without control for baseline outcome*/
	qui areg ztheta_`y'_endline communitybased if purecontrol==1|communitybased==1, 
		a(region2) cluster(settlement_code);
	outreg communitybased using `output'\gambia_ecd_edcc_table5_irt, se 3aster nocons `replace';
	
	/*with control for baseline outcome*/
	qui replace ztheta_`y'_baseline=. if in_baseline==0|in_endline_new==1;
	qui areg ztheta_`y'_endline communitybased ztheta_`y'_baseline
		if purecontrol==1|communitybased==1, a(region2) cluster(settlement_code);
	outreg communitybased using `output'\gambia_ecd_edcc_table5_irt, se 3aster nocons `replace';

	/*top-coded at +/-3*/
	/*without control for baseline outcome*/
	qui areg ztheta_`y'_endline_tc communitybased if purecontrol==1|communitybased==1, 
		a(region2) cluster(settlement_code);
	outreg communitybased using `output'\gambia_ecd_edcc_table5_irt, se 3aster nocons `replace';
	
	/*with control for baseline outcome*/
	qui replace ztheta_`y'_baseline_tc=. if in_baseline==0|in_endline_new==1;
	qui areg ztheta_`y'_endline_tc communitybased ztheta_`y'_baseline_tc
		if purecontrol==1|communitybased==1, a(region2) cluster(settlement_code);
	outreg communitybased using `output'\gambia_ecd_edcc_table5_irt, se 3aster nocons `replace';
	
	/*ECD ANNEX EXPERIMENT*/
	/*imputing 0/1 for missing items*/
	forval i=0/1 {;
		/*without control for baseline outcome*/
		qui areg ztheta_`y'_endline_imp`i' ECDAnnex_treated if ECDAnnex_control==1|ECDAnnex_treated==1, 
			a(region2) cluster(settlement_code);
		outreg ECDAnnex_treated using `output'\gambia_ecd_edcc_table5_irt, se 3aster nocons `replace';
		
		/*with control for baseline outcome*/
		qui replace ztheta_`y'_baseline_imp`i'=. if in_baseline==0|in_endline_new==1;
		qui areg ztheta_`y'_endline_imp`i' ECDAnnex_treated ztheta_`y'_baseline_imp`i' 
			if ECDAnnex_control==1|ECDAnnex_treated==1, a(region2) cluster(settlement_code);
		outreg ECDAnnex_treated using `output'\gambia_ecd_edcc_table5_irt, se 3aster nocons `replace';
	};
	
	/*complete cases*/
	/*without control for baseline outcome*/
	qui areg ztheta_`y'_endline ECDAnnex_treated if ECDAnnex_control==1|ECDAnnex_treated==1, 
		a(region2) cluster(settlement_code);
	outreg ECDAnnex_treated using `output'\gambia_ecd_edcc_table5_irt, se 3aster nocons `replace';
	
	/*with control for baseline outcome*/
	qui replace ztheta_`y'_baseline=. if in_baseline==0|in_endline_new==1;
	qui areg ztheta_`y'_endline ECDAnnex_treated ztheta_`y'_baseline 
		if ECDAnnex_control==1|ECDAnnex_treated==1, a(region2) cluster(settlement_code);
	outreg ECDAnnex_treated using `output'\gambia_ecd_edcc_table5_irt, se 3aster nocons `replace';

	/*top-coded at +/-3*/
	/*without control for baseline outcome*/
	qui areg ztheta_`y'_endline_tc ECDAnnex_treated if ECDAnnex_control==1|ECDAnnex_treated==1, 
		a(region2) cluster(settlement_code);
	outreg ECDAnnex_treated using `output'\gambia_ecd_edcc_table5_irt, se 3aster nocons `replace';
	
	/*with control for baseline outcome*/
	qui replace ztheta_`y'_baseline_tc=. if in_baseline==0|in_endline_new==1;
	qui areg ztheta_`y'_endline_tc ECDAnnex_treated ztheta_`y'_baseline_tc
		if ECDAnnex_control==1|ECDAnnex_treated==1, a(region2) cluster(settlement_code);
	outreg ECDAnnex_treated using `output'\gambia_ecd_edcc_table5_irt, se 3aster nocons `replace';
};

local end=`"$S_TIME"'; 
di "`start'";
di "`end'";
log close;
